! (C) Copyright 1995- ECMWF.
! (C) Copyright 1995- Meteo-France.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRLTOG_MOD_dp

PUBLIC TRLTOG
PRIVATE TRLTOG_PROLOG, TRLTOG_COMM, TRLTOG_COMM_HEAP, TRLTOG_COMM_STACK

CONTAINS

SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
 &PGP,PGPUV,PGP3A,PGP3B,PGP2)

!**** *TRLTOG * - head routine for transposition of grid point data from latitudinal
!                 to column structure (this takes place between inverse
!                 FFT and grid point calculations)
!                 TRLTOG is the inverse of TRGTOL

!**   Interface.
!     ----------
!        *call* *TRLTOG(...)

!        Explicit arguments :
!        --------------------
!           PGLAT    -  Latitudinal data ready for direct FFT (input)
!           PGP    -  Blocked grid point data    (output)
!           KVSET    - "v-set" for each field      (input)

!        Implicit arguments :
!        --------------------

!     Method.
!     -------
!        See documentation

!     Externals.
!     ----------

!     Reference.
!     ----------
!        ECMWF Research Department documentation of the IFS

!     Author.
!     -------
!        R. El Khatib *Meteo-France*

!     Modifications.
!     --------------
!        Original  : 18-Aug-2014 from trltog
!        R. El Khatib 09-Sep-2020 NSTACK_MEMORY_TR
!     ------------------------------------------------------------------

USE EC_PARKIND  ,ONLY : JPIM     ,JPRD
USE YOMHOOK   ,ONLY : LHOOK,   DR_HOOK, JPHOOK

USE TPM_GEN         ,ONLY : NSTACK_MEMORY_TR
USE TPM_DISTR       ,ONLY : D, NPRTRNS, NPROC
USE TPM_TRANS_dp       ,ONLY : NGPBLKS

IMPLICIT NONE

INTEGER(KIND=JPIM),INTENT(IN)  :: KF_FS,KF_GP
INTEGER(KIND=JPIM),INTENT(IN)  :: KF_SCALARS_G
REAL(KIND=JPRD),INTENT(IN)     :: PGLAT(KF_FS,D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KVSET(KF_GP)
INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP(:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGPUV(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3A(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3B(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP2(:,:,:)

INTEGER(KIND=JPIM) :: ISENDCOUNT
INTEGER(KIND=JPIM) :: IRECVCOUNT
INTEGER(KIND=JPIM) :: INSEND
INTEGER(KIND=JPIM) :: INRECV
INTEGER(KIND=JPIM) :: ISENDTOT (NPROC)
INTEGER(KIND=JPIM) :: IRECVTOT (NPROC)
INTEGER(KIND=JPIM) :: ISEND    (NPROC)
INTEGER(KIND=JPIM) :: IRECV    (NPROC)
INTEGER(KIND=JPIM) :: IINDEX(D%NLENGTF)
INTEGER(KIND=JPIM) :: INDOFF(NPROC)
INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS)
INTEGER(KIND=JPIM) :: ISETAL(NPROC), ISETBL(NPROC), ISETWL(NPROC), ISETVL(NPROC)

REAL(KIND=JPHOOK) :: ZHOOK_HANDLE

!     ------------------------------------------------------------------

IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE)

CALL TRLTOG_PROLOG(KF_FS,KF_GP,KVSET,&
 & ISENDCOUNT,IRECVCOUNT,INSEND,INRECV,ISENDTOT,IRECVTOT,ISEND,IRECV,IINDEX,INDOFF,IGPTRSEND, &
 & ISETAL,ISETBL,ISETWL,ISETVL)
IF (NSTACK_MEMORY_TR==0) THEN
  CALL TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET, &
   & ISENDCOUNT,IRECVCOUNT,INSEND,INRECV,ISENDTOT,IRECVTOT,ISEND,IRECV,IINDEX,INDOFF,IGPTRSEND, &
   & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2,&
   & ISETAL,ISETBL,ISETWL,ISETVL)
ELSE
  CALL TRLTOG_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET, &
   & ISENDCOUNT,IRECVCOUNT,INSEND,INRECV,ISENDTOT,IRECVTOT,ISEND,IRECV,IINDEX,INDOFF,IGPTRSEND, &
   & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2,&
   & ISETAL,ISETBL,ISETWL,ISETVL)
ENDIF

IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE)

!     ------------------------------------------------------------------

END SUBROUTINE TRLTOG

SUBROUTINE TRLTOG_PROLOG(KF_FS,KF_GP,KVSET,&
 & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND, &
 & KSETAL,KSETBL,KSETWL,KSETVL)

!**** *TRLTOG_PROLOG * - prolog for transposition of grid point data from latitudinal
!                 to column structure (this takes place between inverse
!                 FFT and grid point calculations) : the purpose is essentially 
!                 to compute the size of communication buffers in order to enable
!                 the use of automatic arrays later.
!                 TRLTOG_PROLOG is the inverse of TRGTOL_PROLOG

!     Purpose.
!     --------

!**   Interface.
!     ----------
!        *call* *TRLTOG_PROLOG(...)

!        Explicit arguments :
!        --------------------
!           KVSET    - "v-set" for each field      (input)

!        Implicit arguments :
!        --------------------

!     Method.
!     -------
!        See documentation

!     Externals.
!     ----------

!     Reference.
!     ----------
!        ECMWF Research Department documentation of the IFS

!     Author.
!     -------
!        R. El Khatib *Meteo-France*

!     Modifications.
!     --------------
!        Original  : 18-Aug-2014 from trltog
!     ------------------------------------------------------------------

USE EC_PARKIND  ,ONLY : JPIM

USE TPM_DISTR       ,ONLY : D, MYSETW, NPRTRNS, MYPROC, NPROC
USE TPM_TRANS_dp       ,ONLY : NGPBLKS

USE INIGPTR_MOD_dp     ,ONLY : INIGPTR
USE PE2SET_MOD      ,ONLY : PE2SET
!

IMPLICIT NONE

INTEGER(KIND=JPIM),INTENT(IN)  :: KF_FS,KF_GP
INTEGER(KIND=JPIM),INTENT(IN)  :: KVSET(KF_GP)

INTEGER(KIND=JPIM), INTENT(OUT) :: KSENDCOUNT
INTEGER(KIND=JPIM), INTENT(OUT) :: KRECVCOUNT
INTEGER(KIND=JPIM), INTENT(OUT) :: KNSEND
INTEGER(KIND=JPIM), INTENT(OUT) :: KNRECV
INTEGER(KIND=JPIM), INTENT(OUT) :: KSENDTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KRECVTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KSEND    (NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KRECV    (NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KINDEX(D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(OUT) :: KNDOFF(NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KGPTRSEND(2,NGPBLKS,NPRTRNS)
INTEGER(KIND=JPIM), INTENT(OUT) :: KSETAL(NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KSETBL(NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KSETWL(NPROC)
INTEGER(KIND=JPIM), INTENT(OUT) :: KSETVL(NPROC)

INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS)
INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILASTLAT, IPOS, ISETA, ISETB, ISETV
INTEGER(KIND=JPIM) :: ISEND, JFLD, JGL, JL, ISETW, JROC, J
INTEGER(KIND=JPIM) :: INDOFFX,IBUFLENS,IBUFLENR

!     ------------------------------------------------------------------

!*       0.    Some initializations
!              --------------------

CALL GSTATS(1806,0)

CALL INIGPTR(KGPTRSEND,IGPTRRECV)

INDOFFX  = 0
IBUFLENS = 0
IBUFLENR = 0
KNRECV = 0
KNSEND = 0

DO JROC=1,NPROC

  CALL PE2SET(JROC,KSETAL(JROC),KSETBL(JROC),KSETWL(JROC),KSETVL(JROC))
  ISEND      = JROC
  ISETA=KSETAL(JROC)
  ISETB=KSETBL(JROC)
  ISETW=KSETWL(JROC)
  ISETV=KSETVL(JROC)
!             count up expected number of fields
  IPOS = 0
  DO JFLD=1,KF_GP
    IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1) IPOS = IPOS+1
  ENDDO
  KRECVTOT(JROC) = IGPTRRECV(ISETW)*IPOS
  IF(KRECVTOT(JROC) > 0 .AND. MYPROC /= JROC) THEN
    KNRECV = KNRECV + 1
    KRECV(KNRECV)=JROC
  ENDIF

  IF( JROC /= MYPROC) IBUFLENR = MAX(IBUFLENR,KRECVTOT(JROC))

  IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA))
  ILASTLAT  = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA))

  IPOS = 0
  DO JGL=IFIRSTLAT,ILASTLAT
    IGL  = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA)
    IPOS = IPOS+D%NONL(IGL,ISETB)
  ENDDO

  KSENDTOT(JROC) = IPOS*KF_FS
  IF( JROC /= MYPROC) THEN
    IBUFLENS = MAX(IBUFLENS,KSENDTOT(JROC))
    IF(KSENDTOT(JROC) > 0) THEN
      KNSEND = KNSEND+1
      KSEND(KNSEND)=JROC
    ENDIF
  ENDIF

  IF(IPOS > 0) THEN
    KNDOFF(JROC) = INDOFFX
    INDOFFX = INDOFFX+IPOS
    IPOS = 0
    DO JGL=IFIRSTLAT,ILASTLAT
      IGL  = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA)
      IGLL = JGL-D%NPTRLS(MYSETW)+1
      DO JL=D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL),&
       &D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+D%NONL(IGL,ISETB)-1
        IPOS = IPOS+1
        KINDEX(IPOS+KNDOFF(JROC)) = JL
      ENDDO
    ENDDO
  ENDIF
ENDDO

KSENDCOUNT=0
KRECVCOUNT=0
DO J=1,NPROC
  KSENDCOUNT=MAX(KSENDCOUNT,KSENDTOT(J))
  KRECVCOUNT=MAX(KRECVCOUNT,KRECVTOT(J))
ENDDO

CALL GSTATS(1806,1)

END SUBROUTINE TRLTOG_PROLOG

SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,&
 & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,&
 & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, &
 & KSETAL, KSETBL,KSETWL,KSETVL)

USE EC_PARKIND  ,ONLY : JPIM     ,JPRD
USE TPM_DISTR ,ONLY : D, NPRTRNS, NPROC
USE TPM_TRANS_dp ,ONLY : NGPBLKS

IMPLICIT NONE

INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS,KF_GP
REAL(KIND=JPRD),INTENT(IN)     :: PGLAT(KF_FS,D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KVSET(KF_GP)
INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G
INTEGER(KIND=JPIM), INTENT(IN) :: KSENDCOUNT
INTEGER(KIND=JPIM), INTENT(IN) :: KRECVCOUNT
INTEGER(KIND=JPIM), INTENT(IN) :: KNSEND
INTEGER(KIND=JPIM), INTENT(IN) :: KNRECV
INTEGER(KIND=JPIM), INTENT(IN) :: KSENDTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KRECVTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSEND    (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KRECV    (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KINDEX(D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KNDOFF(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KGPTRSEND(2,NGPBLKS,NPRTRNS)
INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP(:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGPUV(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3A(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3B(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP2(:,:,:)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETAL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETBL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC)

REAL(KIND=JPRD), ALLOCATABLE, SAVE :: ZCOMBUFS_HEAP(:,:)
REAL(KIND=JPRD), ALLOCATABLE, SAVE :: ZCOMBUFR_HEAP(:,:)
INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1
INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1
INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1
INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1

IF ( .NOT. ALLOCATED(ZCOMBUFS_HEAP) ) THEN
  ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND))
  ISENDCOUNT_PREV = KSENDCOUNT
  INSEND_PREV = KNSEND
ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN
  DEALLOCATE(ZCOMBUFS_HEAP)
  ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND))
  ISENDCOUNT_PREV = KSENDCOUNT
  INSEND_PREV = KNSEND
ENDIF

! Now, force the OS to allocate this shared array right now, not when it starts to be used which is
! an OPEN-MP loop, that would cause a threads synchronization lock :
IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS_HEAP(-1,1) = HUGE(1._JPRD)

IF ( .NOT. ALLOCATED(ZCOMBUFR_HEAP) ) THEN
  ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV))
  IRECVCOUNT_PREV = KRECVCOUNT
  INRECV_PREV = KNRECV
ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN
  DEALLOCATE(ZCOMBUFR_HEAP)
  ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV))
  IRECVCOUNT_PREV = KRECVCOUNT
  INRECV_PREV = KNRECV
ENDIF

CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,&
 & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,&
 & ZCOMBUFS_HEAP,ZCOMBUFR_HEAP, &
 & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, &
 & KSETAL, KSETBL,KSETWL,KSETVL)

END SUBROUTINE TRLTOG_COMM_HEAP

SUBROUTINE TRLTOG_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,&
 & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,&
 & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, &
 & KSETAL, KSETBL,KSETWL,KSETVL)

USE EC_PARKIND  ,ONLY : JPIM     ,JPRD
USE TPM_DISTR ,ONLY : D, NPRTRNS, NPROC
USE TPM_TRANS_dp ,ONLY : NGPBLKS

IMPLICIT NONE

INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS,KF_GP
REAL(KIND=JPRD),INTENT(IN)     :: PGLAT(KF_FS,D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KVSET(KF_GP)
INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G
INTEGER(KIND=JPIM), INTENT(IN) :: KSENDCOUNT
INTEGER(KIND=JPIM), INTENT(IN) :: KRECVCOUNT
INTEGER(KIND=JPIM), INTENT(IN) :: KNSEND
INTEGER(KIND=JPIM), INTENT(IN) :: KNRECV
INTEGER(KIND=JPIM), INTENT(IN) :: KSENDTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KRECVTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSEND    (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KRECV    (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KINDEX(D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KNDOFF(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KGPTRSEND(2,NGPBLKS,NPRTRNS)
INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP(:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGPUV(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3A(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3B(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP2(:,:,:)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETAL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETBL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC)

REAL(KIND=JPRD) :: ZCOMBUFS_STACK(-1:KSENDCOUNT,KNSEND)
REAL(KIND=JPRD) :: ZCOMBUFR_STACK(-1:KRECVCOUNT,KNRECV)

CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,&
 & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,&
 & ZCOMBUFS_STACK,ZCOMBUFR_STACK, &
 & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, &
 & KSETAL, KSETBL,KSETWL,KSETVL)

END SUBROUTINE TRLTOG_COMM_STACK

SUBROUTINE TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,&
 & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,&
 & PCOMBUFS,PCOMBUFR, &
 & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, &
 & KSETAL, KSETBL,KSETWL,KSETVL)
 

!**** *trltog * - transposition of grid point data from latitudinal
!                 to column structure. This takes place between inverse
!                 FFT and grid point calculations.
!                 TRLTOG_COMM is the inverse of TRGTOL

!     Purpose.
!     --------

!**   Interface.
!     ----------
!        *call* *trltog(...)

!        Explicit arguments :
!        --------------------
!           PGLAT    -  Latitudinal data ready for direct FFT (input)
!           PGP    -  Blocked grid point data    (output)
!           KVSET    - "v-set" for each field      (input)

!        Implicit arguments :
!        --------------------

!     Method.
!     -------
!        See documentation

!     Externals.
!     ----------

!     Reference.
!     ----------
!        ECMWF Research Department documentation of the IFS

!     Author.
!     -------
!        MPP Group *ECMWF*

!     Modifications.
!     --------------
!        Original  : 95-10-01
!        D.Dent    : 97-08-04 Reorganisation to allow NPRTRV
!                             to differ from NPRGPEW
!        =99-03-29= Mats Hamrud and Deborah Salmond
!                   JUMP in FFT's changed to 1
!                   KINDEX introduced and PCOMBUF not used for same PE
!         01-11-23  Deborah Salmond and John Hague
!                   LIMP_NOOLAP Option for non-overlapping message passing
!                               and buffer packing
!         01-12-18  Peter Towers
!                   Improved vector performance of LTOG_PACK,LTOG_UNPACK
!         03-0-02   G. Radnoti: Call barrier always when nproc>1
!         08-01-01  G.Mozdzynski: cleanup
!         09-01-02  G.Mozdzynski: use non-blocking recv and send
!        R. El Khatib 09-Sep-2020 64 bits addressing for PGLAT
!     ------------------------------------------------------------------

USE EC_PARKIND  ,ONLY : JPIM     ,JPRD    ,JPIB
USE YOMHOOK   ,ONLY : LHOOK,   DR_HOOK, JPHOOK

USE MPL_MODULE  ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_WAITANY, &
     & JP_BLOCKING_STANDARD, MPL_BARRIER, JP_BLOCKING_BUFFERED

USE TPM_GEN         ,ONLY : NOUT, NTRANS_SYNC_LEVEL
USE TPM_DISTR       ,ONLY : D, MYSETV, MYSETW, MTAGLG,      &
     &                      NPRCIDS, NPRTRNS, MYPROC, NPROC
USE TPM_TRANS_dp       ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS

USE PE2SET_MOD      ,ONLY : PE2SET
USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS

IMPLICIT NONE


INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS,KF_GP
REAL(KIND=JPRD),INTENT(IN)     :: PGLAT(KF_FS,D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KVSET(KF_GP)
INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G
INTEGER(KIND=JPIM), INTENT(IN) :: KSENDCOUNT
INTEGER(KIND=JPIM), INTENT(IN) :: KRECVCOUNT
INTEGER(KIND=JPIM), INTENT(IN) :: KNSEND
INTEGER(KIND=JPIM), INTENT(IN) :: KNRECV
INTEGER(KIND=JPIM), INTENT(IN) :: KSENDTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KRECVTOT (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSEND    (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KRECV    (NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KINDEX(D%NLENGTF)
INTEGER(KIND=JPIM), INTENT(IN) :: KNDOFF(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KGPTRSEND(2,NGPBLKS,NPRTRNS)
REAL(KIND=JPRD), INTENT(INOUT) :: PCOMBUFS(-1:KSENDCOUNT,KNSEND)
REAL(KIND=JPRD), INTENT(INOUT) :: PCOMBUFR(-1:KRECVCOUNT,KNRECV)
INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP(:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGPUV(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3A(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP3B(:,:,:,:)
REAL(KIND=JPRD),OPTIONAL,INTENT(OUT)     :: PGP2(:,:,:)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETAL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETBL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC)
INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC)

! LOCAL VARIABLES

INTEGER(KIND=JPIM) :: IPOSPLUS(KNRECV)
INTEGER(KIND=JPIM) :: ISETW(KNRECV)
INTEGER(KIND=JPIM) :: JPOS(NGPBLKS,KNRECV)
INTEGER(KIND=JPIM) :: IFLDA(KF_GP,KNRECV)
INTEGER(KIND=JPIM) :: IREQ_SEND(NPROC)
INTEGER(KIND=JPIM) :: IREQ_RECV(NPROC)

INTEGER(KIND=JPIM) :: IFIRST, IFLD, ILAST, IPOS, ISETA, ISETB, IRECV, ISETV
INTEGER(KIND=JPIM) :: ISEND, ITAG,  JBLK, JFLD, JK, JL, IFLDS, INR, INS
INTEGER(KIND=JPIM) :: II,ILEN, IFLDT, JI, JJ, J

INTEGER(KIND=JPIB) :: JFLD64

LOGICAL   :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY
LOGICAL   :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP)
LOGICAL   :: LLINDER
INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP)
INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP)
INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF
INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2, JNR
INTEGER(KIND=JPIM) :: IFLDOFF(KF_FS)
INTEGER(KIND=JPIM) :: IRECV_FLD_START,IRECV_FLD_END
INTEGER(KIND=JPIM) :: ISEND_FLD_START,ISEND_FLD_END
INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS)

REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR

!     ------------------------------------------------------------------

!*       0.    Some initializations
!              --------------------

ITAG   = MTAGLG

IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',0,ZHOOK_HANDLE_BAR)
CALL GSTATS_BARRIER(762)
IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',1,ZHOOK_HANDLE_BAR)

CALL GSTATS(805,0)

IF (NTRANS_SYNC_LEVEL <= 0) THEN
   !...Receive loop.........................................................
   DO INR=1,KNRECV
      IRECV=KRECV(INR)
      CALL MPL_RECV(PCOMBUFR(-1:KRECVTOT(IRECV),INR), &
           & KSOURCE=NPRCIDS(IRECV), &
           & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ_RECV(INR), &
           & KTAG=ITAG,CDSTRING='TRLTOG_COMM: NON-BLOCKING IRECV' )
   ENDDO
ENDIF

CALL GSTATS(805,1)

CALL GSTATS(1806,0)
LLINDER = .FALSE.
LLPGPUV = .FALSE.
LLPGP3A = .FALSE.
LLPGP3B = .FALSE.
LLPGP2  = .FALSE.
LLPGPONLY = .FALSE.
IF(PRESENT(KPTRGP))  LLINDER = .TRUE.
IF(PRESENT(PGP))     LLPGPONLY=.TRUE.
IF(PRESENT(PGPUV))   LLPGPUV=.TRUE.
IF(PRESENT(PGP3A))   LLPGP3A=.TRUE.
IF(PRESENT(PGP3B))   LLPGP3B=.TRUE.
IF(PRESENT(PGP2))    LLPGP2=.TRUE.

IUVPAR=0
IUVLEV=0
IOFF1=0
IOFFNS=KF_SCALARS_G
IOFFEW=2*KF_SCALARS_G

LLUV(:) = .FALSE.
IF (LLPGPUV) THEN
  IOFF=0
  IUVLEV=UBOUND(PGPUV,2)
  IF(LVORGP) THEN
    IUVPAR=IUVPAR+1
    DO J=1,IUVLEV
      IUVLEVS(IOFF+J)=J
      IUVPARS(IOFF+J)=IUVPAR
      LLUV(IOFF+J)=.TRUE.
    ENDDO
    IOFF=IOFF+IUVLEV
  ENDIF
  IF(LDIVGP) THEN
    IUVPAR=IUVPAR+1
    DO J=1,IUVLEV
      IUVLEVS(IOFF+J)=J
      IUVPARS(IOFF+J)=IUVPAR
      LLUV(IOFF+J)=.TRUE.
    ENDDO
    IOFF=IOFF+IUVLEV
  ENDIF
  DO J=1,IUVLEV
    IUVLEVS(IOFF+J)=J
    IUVPARS(IOFF+J)=IUVPAR+1
    IUVLEVS(IOFF+J+IUVLEV)=J
    IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2
  ENDDO
  IUVPAR=IUVPAR+2
  LLUV(IOFF+1:IOFF+2*IUVLEV)=.TRUE.
  IOFF=IOFF+2*IUVLEV
  IOFF1=IOFF
  IOFFNS=IOFFNS+IOFF
  IOFFEW=IOFFEW+IOFF

  IOFF=IUVPAR*IUVLEV+KF_SCALARS_G
  IF(LUVDER) THEN
    IF(LSCDERS) IOFF=IOFF+KF_SCALARS_G
    DO J=1,IUVLEV
      IUVLEVS(IOFF+J)=J
      IUVPARS(IOFF+J)=IUVPAR+1
      LLUV(IOFF+J)=.TRUE.
      IUVLEVS(IOFF+J+IUVLEV)=J
      IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2
      LLUV(IOFF+J+IUVLEV)=.TRUE.
    ENDDO
    IUVPAR=IUVPAR+2
    IOFF=IOFF+2*IUVLEV
    IOFFEW=IOFFEW+2*IUVLEV
  ENDIF
ENDIF

LLGP2(:)=.FALSE.
IF(LLPGP2) THEN
  IOFF=IOFF1
  IGP2PAR=UBOUND(PGP2,2)
  IF(LSCDERS) IGP2PAR=IGP2PAR/3
  DO J=1,IGP2PAR
    LLGP2(J+IOFF) = .TRUE.
    IGP2PARS(J+IOFF)=J
  ENDDO
  IOFF1=IOFF1+IGP2PAR
  IF(LSCDERS) THEN
    IOFF=IOFFNS
    DO J=1,IGP2PAR
      LLGP2(J+IOFF) = .TRUE.
      IGP2PARS(J+IOFF)=J+IGP2PAR
    ENDDO
    IOFFNS=IOFF+IGP2PAR
    IOFF=IOFFEW
    DO J=1,IGP2PAR
      LLGP2(J+IOFF) = .TRUE.
      IGP2PARS(J+IOFF)=J+2*IGP2PAR
    ENDDO
    IOFFEW=IOFF+IGP2PAR
  ENDIF
ENDIF

LLGP3A(:) = .FALSE.
IF(LLPGP3A) THEN
  IGP3ALEV=UBOUND(PGP3A,2)
  IGP3APAR=UBOUND(PGP3A,3)
  IF(LSCDERS) IGP3APAR=IGP3APAR/3
  IOFF=IOFF1
  DO J1=1,IGP3APAR
    DO J2=1,IGP3ALEV
      LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE.
      IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1
      IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2
    ENDDO
  ENDDO
  IPAROFF=IGP3APAR
  IOFF1=IOFF1+IGP3APAR*IGP3ALEV
  IF(LSCDERS) THEN
    IOFF=IOFFNS
    DO J1=1,IGP3APAR
      DO J2=1,IGP3ALEV
        LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE.
        IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF
        IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2
      ENDDO
    ENDDO
    IPAROFF=IPAROFF+IGP3APAR
    IOFFNS=IOFFNS+IGP3APAR*IGP3ALEV
    IOFF=IOFFEW
    DO J1=1,IGP3APAR
      DO J2=1,IGP3ALEV
        LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE.
        IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF
        IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2
      ENDDO
    ENDDO
    IOFFEW=IOFFEW+IGP3APAR*IGP3ALEV
  ENDIF
ENDIF

LLGP3B(:) = .FALSE.
IF(LLPGP3B) THEN
  IGP3BLEV=UBOUND(PGP3B,2)
  IGP3BPAR=UBOUND(PGP3B,3)
  IF(LSCDERS) IGP3BPAR=IGP3BPAR/3
  IOFF=IOFF1
  DO J1=1,IGP3BPAR
    DO J2=1,IGP3BLEV
      LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE.
      IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1
      IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2
    ENDDO
  ENDDO
  IPAROFF=IGP3BPAR
  IOFF1=IOFF1+IGP3BPAR*IGP3BLEV
  IF(LSCDERS) THEN
    IOFF=IOFFNS
    DO J1=1,IGP3BPAR
      DO J2=1,IGP3BLEV
        LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE.
        IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF
        IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2
      ENDDO
    ENDDO
    IPAROFF=IPAROFF+IGP3BPAR
    IOFFNS=IOFFNS+IGP3BPAR*IGP3BLEV
    IOFF=IOFFEW
    DO J1=1,IGP3BPAR
      DO J2=1,IGP3BLEV
        LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE.
        IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF
        IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2
      ENDDO
    ENDDO
    IOFFEW=IOFFEW+IGP3BPAR*IGP3BLEV
  ENDIF
ENDIF  

CALL GSTATS(1806,1)


! Copy local contribution
IF( KRECVTOT(MYPROC) > 0 )THEN
  IFLDS = 0
  DO JFLD=1,KF_GP
    IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN
      IFLDS = IFLDS+1
      IF(LLINDER) THEN
        IFLDOFF(IFLDS) = KPTRGP(JFLD)
      ELSE
        IFLDOFF(IFLDS) = JFLD
      ENDIF
    ENDIF
  ENDDO

  IPOS=0
  DO JBLK=1,NGPBLKS
    IGPTROFF(JBLK)=IPOS
    IFIRST = KGPTRSEND(1,JBLK,MYSETW)
    IF(IFIRST > 0) THEN
      ILAST = KGPTRSEND(2,JBLK,MYSETW)
      IPOS=IPOS+ILAST-IFIRST+1
    ENDIF
  ENDDO

  CALL GSTATS(1604,0)
#ifdef __NEC__
! Loops inversion is still better on Aurora machines, according to CHMI. REK.
!$OMP PARALLEL DO SCHEDULE(DYNAMIC) PRIVATE(JFLD64,JBLK,JK,IFLD,IPOS,IFIRST,ILAST)
#else
!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD64,JBLK,JK,IFLD,IPOS,IFIRST,ILAST)
#endif
  DO JBLK=1,NGPBLKS
    IFIRST = KGPTRSEND(1,JBLK,MYSETW)
    IF(IFIRST > 0) THEN
      ILAST = KGPTRSEND(2,JBLK,MYSETW)
! Address PGLAT over 64 bits because its size may exceed 2 GB for big data and
! small number of tasks.
      IF(LLPGPONLY) THEN
        DO JFLD64=1,IFLDS
          IFLD = IFLDOFF(JFLD64)
!DIR$ VECTOR ALWAYS
          DO JK=IFIRST,ILAST
            IPOS = KNDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
            PGP(JK,IFLD,JBLK) = PGLAT(JFLD64,KINDEX(IPOS))
          ENDDO
        ENDDO
      ELSE
        DO JFLD64=1,IFLDS
          IFLD = IFLDOFF(JFLD64)
          IF(LLUV(IFLD)) THEN
!DIR$ VECTOR ALWAYS
            DO JK=IFIRST,ILAST
              IPOS = KNDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
              PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PGLAT(JFLD64,KINDEX(IPOS))
            ENDDO
          ELSEIF(LLGP2(IFLD)) THEN
!DIR$ VECTOR ALWAYS
            DO JK=IFIRST,ILAST
              IPOS = KNDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
              PGP2(JK,IGP2PARS(IFLD),JBLK)=PGLAT(JFLD64,KINDEX(IPOS))
            ENDDO
          ELSEIF(LLGP3A(IFLD)) THEN
!DIR$ VECTOR ALWAYS
            DO JK=IFIRST,ILAST
              IPOS = KNDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
              PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PGLAT(JFLD64,KINDEX(IPOS))
            ENDDO
          ELSEIF(LLGP3B(IFLD)) THEN
!DIR$ VECTOR ALWAYS
            DO JK=IFIRST,ILAST
              IPOS = KNDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
              PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PGLAT(JFLD64,KINDEX(IPOS))
            ENDDO
          ELSE
            WRITE(NOUT,*)'TRLTOG_MOD_dp: ERROR',JFLD64,IFLD
            CALL ABORT_TRANS('TRLTOG_MOD_dp: ERROR')
          ENDIF
        ENDDO
      ENDIF
    ENDIF
  ENDDO
!$OMP END PARALLEL DO
  CALL GSTATS(1604,1)

ENDIF

!
! loop over the number of processors we need to communicate with.
! NOT MYPROC
!
! Now overlapping buffer packing/unpacking with sends/waits
! Time as if all communications to avoid double accounting

CALL GSTATS(805,0)

!  Pack+send loop.........................................................

ISEND_FLD_START = 1
ISEND_FLD_END   = KF_FS
DO INS=1,KNSEND
  ISEND=KSEND(INS)
  ILEN = KSENDTOT(ISEND)/KF_FS
!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD,JL,II)
  DO JL=1,ILEN
    II = KINDEX(KNDOFF(ISEND)+JL)
    DO JFLD=ISEND_FLD_START,ISEND_FLD_END
      PCOMBUFS((JFLD-ISEND_FLD_START)*ILEN+JL,INS) = PGLAT(JFLD,II)
    ENDDO
  ENDDO
!$OMP END PARALLEL DO
  PCOMBUFS(-1,INS) = 1
  PCOMBUFS(0,INS)  = KF_FS
  IF (NTRANS_SYNC_LEVEL <= 1) THEN
     CALL MPL_SEND(PCOMBUFS(-1:KSENDTOT(ISEND),INS),KDEST=NPRCIDS(ISEND),&
          & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ_SEND(INS), &
          & KTAG=ITAG,CDSTRING='TRLTOG_COMM: NON-BLOCKING ISEND')
  ELSE
     CALL MPL_SEND(PCOMBUFS(-1:KSENDTOT(ISEND),INS),KDEST=NPRCIDS(ISEND),&
          & KMP_TYPE=JP_BLOCKING_BUFFERED, &
          & KTAG=ITAG,CDSTRING='TRLTOG_COMM: BLOCKING BUFFERED BSEND')
  ENDIF
ENDDO

!  Unpack loop.........................................................

!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(INR,IRECV,ISETA,ISETB,ISETV,IFLD,JFLD,IPOS,JBLK,IFIRST,ILAST)
DO INR=1,KNRECV
  IRECV=KRECV(INR)
  
  ISETA=KSETAL(IRECV)
  ISETB=KSETBL(IRECV)
  ISETW(INR)=KSETWL(IRECV)
  ISETV=KSETVL(IRECV)
  IFLD = 0
  DO JFLD=1,KF_GP
    IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN
      IFLD = IFLD+1
      IFLDA(IFLD,INR)=JFLD
    ENDIF
  ENDDO
  IPOS = 0
  IPOSPLUS(INR)=0
  DO JBLK=1,NGPBLKS
    IFIRST = KGPTRSEND(1,JBLK,ISETW(INR))
    IF(IFIRST > 0) THEN
      ILAST = KGPTRSEND(2,JBLK,ISETW(INR))
      JPOS(JBLK,INR)=IPOS
      IPOSPLUS(INR)=IPOSPLUS(INR)+(ILAST-IFIRST+1)
      IPOS=IPOS+(ILAST-IFIRST+1)
    ENDIF
  ENDDO
ENDDO
!$OMP END PARALLEL DO

DO JNR=1,KNRECV
  
  IF (NTRANS_SYNC_LEVEL <= 0) THEN
     CALL MPL_WAITANY(KREQUEST=IREQ_RECV(1:KNRECV),KINDEX=INR,&
          & CDSTRING='TRLTOG_COMM: WAIT FOR ANY RECEIVES')
  ELSE
     INR = JNR
     IRECV=KRECV(INR)
     CALL MPL_RECV(PCOMBUFR(-1:KRECVTOT(IRECV),INR), &
          & KSOURCE=NPRCIDS(IRECV), &
          & KMP_TYPE=JP_BLOCKING_STANDARD, &
          & KTAG=ITAG,CDSTRING='TRLTOG_COMM: BLOCKING RECV' )
  ENDIF

  IPOS=IPOSPLUS(INR)
  IRECV_FLD_START = PCOMBUFR(-1,INR)
  IRECV_FLD_END   = PCOMBUFR(0,INR)
  
!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(IFLDT,IFIRST,ILAST,JK,JJ,JI,JBLK)
  DO JJ=IRECV_FLD_START,IRECV_FLD_END
    IFLDT=IFLDA(JJ,INR)
    DO JBLK=1,NGPBLKS
      IFIRST = KGPTRSEND(1,JBLK,ISETW(INR))
      IF(IFIRST > 0) THEN
        ILAST = KGPTRSEND(2,JBLK,ISETW(INR))
        IF(LLINDER) THEN
          DO JK=IFIRST,ILAST
            JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK,INR)+JK-IFIRST+1
            PGP(JK,KPTRGP(IFLDT),JBLK) = PCOMBUFR(JI,INR)
          ENDDO
        ELSEIF(LLPGPONLY) THEN
          DO JK=IFIRST,ILAST
            JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK,INR)+JK-IFIRST+1
            PGP(JK,IFLDT,JBLK) = PCOMBUFR(JI,INR)
          ENDDO
        ELSEIF(LLUV(IFLDT)) THEN
          DO JK=IFIRST,ILAST
            JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK,INR)+JK-IFIRST+1
            PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) = PCOMBUFR(JI,INR)
          ENDDO
        ELSEIF(LLGP2(IFLDT)) THEN
          DO JK=IFIRST,ILAST
            JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK,INR)+JK-IFIRST+1
            PGP2(JK,IGP2PARS(IFLDT),JBLK) = PCOMBUFR(JI,INR)
          ENDDO
        ELSEIF(LLGP3A(IFLDT)) THEN
          DO JK=IFIRST,ILAST
            JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK,INR)+JK-IFIRST+1
            PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) = PCOMBUFR(JI,INR)
          ENDDO
        ELSEIF(LLGP3B(IFLDT)) THEN
          DO JK=IFIRST,ILAST
            JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK,INR)+JK-IFIRST+1
            PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) = PCOMBUFR(JI,INR)
          ENDDO
        ENDIF
      ENDIF
    ENDDO
  ENDDO
!$OMP END PARALLEL DO
ENDDO

IF (NTRANS_SYNC_LEVEL <= 1) THEN
   IF(KNSEND > 0) THEN
      CALL MPL_WAIT(KREQUEST=IREQ_SEND(1:KNSEND),CDSTRING='TRLTOG_COMM: WAIT FOR ISENDS')
   ENDIF
ENDIF

IF (NTRANS_SYNC_LEVEL >= 1) THEN
   CALL MPL_BARRIER(CDSTRING='TRLTOG_COMM: BARRIER AT END')
ENDIF

CALL GSTATS(805,1)

CALL GSTATS_BARRIER2(762)

END SUBROUTINE TRLTOG_COMM
END MODULE TRLTOG_MOD_dp
